Reading the data
loving_daily <- readRDS("../DataProcessing/Trailer_daily_merge.rds")
loving_hourly <- readRDS("../DataProcessing/Trailer_hourly_merge.rds")
# making NA count to 0
loving_daily$count[is.na(loving_daily$count)] <- 0
loving_hourly$count[is.na(loving_hourly$count)] <- 0
loving_daily$weighted.count[is.na(loving_daily$weighted.count)] <- 0
loving_hourly$weighted.count[is.na(loving_hourly$weighted.count)] <- 0
# none flare within 100km
loving_daily$closest.flare[is.na(loving_daily$closest.flare)] <- 100
loving_hourly$closest.flare[is.na(loving_hourly$closest.flare)] <- 100
# creating a dataframe without the flares data
loving_daily_noflares <- loving_daily %>% select(-c(temp_bb,rhi, esf_bb, distToLovi,inv_dist))
loving_hourly_noflares <- loving_hourly %>% select(-c(temp_bb,rhi, esf_bb, distToLovi,inv_dist))
# Helper Function to find correlated variables above a threshold
correlated_above_threshold <- function(cor_matrix, var_name, threshold, response) {
# Get the absolute values of correlations for the variable
cor_values <- abs(cor_matrix[var_name, ])
# Find the variables above the threshold
above_threshold <- cor_values >= threshold
# Get the names of the correlated variables above the threshold
correlated_names <- setdiff(names(above_threshold[above_threshold]), "rd_particle_pCi")
# Get the correlation scores with response
correlated_scores<- cor_matrix[response, correlated_names]
return (list(variables = correlated_names, scores = correlated_scores))
}
Modeling
Fitting on Daily
# First we do it for daily
# Define the response variable
response_var <- "rd_particle_pCi_mean"
# Define the names of the columns to exclude
exclude_cols <- c("rd_particle_pCi_mean", "rd_particle_B_mean", "radon_pCi_mean", "radon_B_mean",
"temp_bb", "rhi", "esf_bb", "distToLovi", "monthly_oil", "monthly_gas", "distToLovi_wells",
"inv_dist", "datetime")
# Generate the formula for the GAM model
formula_str_s <- paste(response_var, "~", paste(paste0("s(", setdiff(names(loving_daily), exclude_cols), ")"), collapse = "+"))
formula_str <- paste(response_var, "~", paste(setdiff(names(loving_daily), exclude_cols), collapse = "+"))
# Fit the GAM model
daily_v1 <- mgcv::gam(as.formula(paste0(formula_str, "+s(as.numeric(datetime))")), data = loving_daily_noflares)
# View the summary of the GAM model
summary(daily_v1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## rd_particle_pCi_mean ~ co_mean + no_mean + no2_mean + nox_mean +
## o3_mean + temp_f_mean + pressure_altcorr_mean + wsp_mean +
## wdr_mean + relh_mean + rain_mean + solr_mean + co2_ppm_mean +
## ch4_mean + h2o_sync_mean + ethane_mean + ethene_mean + propane_mean +
## propene_mean + X1_3.butadiene_mean + i.butane_mean + n.butane_mean +
## acetylene_mean + cyclopentane_mean + i.pentane_mean + n.pentane_mean +
## n.hexane_mean + isoprene_mean + n.heptane_mean + benzene_mean +
## n.octane_mean + toluene_mean + ethyl.benzene_mean + m.p.xylene_mean +
## o.xylene_mean + closest.flare + weighted.count + count +
## s(as.numeric(datetime))
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.295e+00 1.962e+00 -0.660 0.51092
## co_mean 5.726e-04 2.288e-04 2.502 0.01411 *
## no_mean 6.996e-01 1.141e+00 0.613 0.54136
## no2_mean 7.009e-01 1.142e+00 0.614 0.54083
## nox_mean -6.998e-01 1.141e+00 -0.613 0.54117
## o3_mean -1.999e-03 7.262e-04 -2.753 0.00713 **
## temp_f_mean 5.671e-04 1.732e-03 0.327 0.74406
## pressure_altcorr_mean 1.114e-03 1.791e-03 0.622 0.53532
## wsp_mean -7.741e-03 5.335e-03 -1.451 0.15024
## wdr_mean 2.893e-04 1.680e-04 1.722 0.08845 .
## relh_mean -2.417e-04 1.339e-03 -0.180 0.85720
## rain_mean -5.656e-01 8.385e-01 -0.675 0.50168
## solr_mean -7.362e-05 9.068e-05 -0.812 0.41893
## co2_ppm_mean 6.934e-05 1.637e-03 0.042 0.96631
## ch4_mean 8.629e-05 2.981e-05 2.894 0.00475 **
## h2o_sync_mean -1.660e-02 3.394e-02 -0.489 0.62585
## ethane_mean 3.654e-04 8.158e-04 0.448 0.65532
## ethene_mean -1.215e-02 8.301e-03 -1.463 0.14681
## propane_mean 3.276e-04 1.430e-03 0.229 0.81927
## propene_mean -2.276e-01 2.507e-01 -0.908 0.36649
## X1_3.butadiene_mean 3.360e+00 2.200e+00 1.527 0.13017
## i.butane_mean 5.420e-03 7.689e-03 0.705 0.48267
## n.butane_mean -1.787e-03 5.682e-03 -0.314 0.75393
## acetylene_mean 2.676e-02 6.328e-02 0.423 0.67340
## cyclopentane_mean 2.538e-01 2.242e-01 1.132 0.26047
## i.pentane_mean -1.477e-03 2.158e-02 -0.068 0.94558
## n.pentane_mean -8.861e-03 3.514e-02 -0.252 0.80150
## n.hexane_mean -3.610e-02 8.036e-02 -0.449 0.65434
## isoprene_mean 1.064e+00 1.357e+00 0.784 0.43523
## n.heptane_mean 4.678e-02 1.019e-01 0.459 0.64720
## benzene_mean -1.529e-01 8.498e-02 -1.799 0.07534 .
## n.octane_mean -2.564e-01 1.142e-01 -2.246 0.02713 *
## toluene_mean 2.738e-01 1.058e-01 2.588 0.01123 *
## ethyl.benzene_mean -1.207e-01 8.005e-02 -1.508 0.13497
## m.p.xylene_mean 2.143e-01 1.271e-01 1.685 0.09532 .
## o.xylene_mean -5.914e-01 4.537e-01 -1.303 0.19575
## closest.flare 4.568e-04 5.361e-04 0.852 0.39637
## weighted.count 1.893e-02 1.701e-02 1.113 0.26866
## count -2.793e-05 2.600e-04 -0.107 0.91471
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(datetime)) 1.744 2.278 1.13 0.393
##
## R-sq.(adj) = 0.716 Deviance explained = 80.2%
## GCV = 0.0020141 Scale est. = 0.0013924 n = 132
plot(daily_v1)

# using leaps and regsubset to search for best predictors
# exhaustive
# not including flares and oil&gas variables
daily_exhaust = regsubsets(
rd_particle_pCi_mean ~.-rd_particle_B_mean-radon_pCi_mean-radon_B_mean-monthly_oil-monthly_gas-distToLovi_wells,
data = loving_daily_noflares, nvmax=30, method = "exhaustive")
reg_summary = summary(daily_exhaust)
par(mfrow = c(2,2))
plot(reg_summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg_summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")
# We will now plot a red dot to indicate the model with the largest adjusted R^2 statistic.
# The which.max() function can be used to identify the location of the maximum point of a vector
adj_r2_max = which.max(reg_summary$adjr2) #15
# The points() command works like the plot() command, except that it puts points
# on a plot that has already been created instead of creating a new plot
points(adj_r2_max, reg_summary$adjr2[adj_r2_max], col ="red", cex = 2, pch = 20)
# We'll do the same for C_p and BIC, this time looking for the models with the SMALLEST statistic
plot(reg_summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
cp_min = which.min(reg_summary$cp) # 9
points(cp_min, reg_summary$cp[cp_min], col = "red", cex = 2, pch = 20)
plot(reg_summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
bic_min = which.min(reg_summary$bic) # 8
points(bic_min, reg_summary$bic[bic_min], col = "red", cex = 2, pch = 20)

# checking coefficients of the variables selected using adjusted R^2
plot(daily_exhaust, scale = "adjr2")

coef(daily_exhaust, 15)
## (Intercept) co_mean o3_mean wsp_mean
## -7.866126e-01 2.150247e-04 -1.503669e-03 -8.404306e-03
## co2_ppm_mean ch4_mean h2o_sync_mean ethene_mean
## 2.001216e-03 8.697424e-05 -3.539654e-02 -1.234194e-02
## cyclopentane_mean i.pentane_mean n.pentane_mean isoprene_mean
## 2.104118e-01 2.216484e-02 -3.417823e-02 -1.016855e+00
## n.octane_mean toluene_mean m.p.xylene_mean o.xylene_mean
## -7.835622e-02 1.078132e-01 -1.741389e-01 5.223981e-01
# checking coefficients of the variables selected using Cp
plot(daily_exhaust, scale = "Cp")

coef(daily_exhaust, 9)
## (Intercept) o3_mean wsp_mean co2_ppm_mean
## -1.015490e+00 -1.328889e-03 -1.079038e-02 2.560781e-03
## ch4_mean h2o_sync_mean ethene_mean cyclopentane_mean
## 9.376348e-05 -3.062397e-02 -1.221147e-02 1.821697e-01
## n.hexane_mean toluene_mean
## -4.190251e-02 6.349890e-02
# checking coefficients of the variables selected using BIC
plot(daily_exhaust, scale = "bic")

coef(daily_exhaust, 8)
## (Intercept) o3_mean wsp_mean co2_ppm_mean ch4_mean
## -1.154038e+00 -1.255093e-03 -1.118515e-02 2.910978e-03 8.784447e-05
## h2o_sync_mean ethene_mean n.heptane_mean toluene_mean
## -2.735884e-02 -1.246049e-02 -5.105644e-02 1.064432e-01
# Fitting a Gam on the BIC selected variables
daily_v2 <- mgcv::gam(rd_particle_pCi_mean ~ co_mean + o3_mean + wsp_mean + co2_ppm_mean + ch4_mean + h2o_sync_mean + ethene_mean + i.pentane_mean + n.pentane_mean + n.octane_mean + toluene_mean + m.p.xylene_mean + o.xylene_mean, data = loving_daily_noflares)
# View the summary of the GAM model
summary(daily_v2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## rd_particle_pCi_mean ~ co_mean + o3_mean + wsp_mean + co2_ppm_mean +
## ch4_mean + h2o_sync_mean + ethene_mean + i.pentane_mean +
## n.pentane_mean + n.octane_mean + toluene_mean + m.p.xylene_mean +
## o.xylene_mean
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.618e-02 5.279e-01 0.087 0.930437
## co_mean 3.449e-04 1.615e-04 2.135 0.034813 *
## o3_mean -3.343e-03 6.082e-04 -5.496 2.27e-07 ***
## wsp_mean -1.622e-02 4.568e-03 -3.550 0.000554 ***
## co2_ppm_mean 7.468e-05 1.232e-03 0.061 0.951757
## ch4_mean 1.102e-04 2.391e-05 4.610 1.03e-05 ***
## h2o_sync_mean -1.425e-02 8.571e-03 -1.663 0.098967 .
## ethene_mean -1.003e-02 7.043e-03 -1.424 0.156991
## i.pentane_mean 1.158e-02 1.264e-02 0.916 0.361291
## n.pentane_mean -1.632e-02 1.142e-02 -1.429 0.155579
## n.octane_mean -1.239e-01 6.259e-02 -1.979 0.050097 .
## toluene_mean 2.202e-01 4.499e-02 4.894 3.16e-06 ***
## m.p.xylene_mean 1.081e-01 9.107e-02 1.187 0.237489
## o.xylene_mean -7.793e-01 3.634e-01 -2.145 0.034035 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.671 Deviance explained = 70.4%
## GCV = 0.0018036 Scale est. = 0.0016123 n = 132
# Fit the GAM model with ridge regression
daily_v2 <- mgcv::gam(as.formula(
paste0(formula_str, "+s(as.numeric(as.Date(datetime)))")
), data = loving_daily, method = "REML", select = TRUE, family = gaussian,
control = mgcv::gam.control(maxit = 100, trace = TRUE))
# View the summary of the GAM model
summary(daily_v2)
# correlation matrix
cor_matrix = cor(loving_daily_noflares %>% select(-c("radon_B_mean", "rd_particle_B_mean","radon_pCi_mean","rd_particle_pCi_mean", "datetime", "monthly_oil", "monthly_gas", "distToLovi_wells")) %>% na.omit())
corrplot.mixed(cor_matrix, order="AOE", number.cex = 0.5, tl.pos = 'lt')

Fitting on Hourly
# For hourly
# Define the response variable
response_var <- "rd_particle_pCi"
# Define the names of the columns to exclude
exclude_cols <- c("radon_pCi", "rd_particle_B", "rd_particle_pCi", "radon_B",
"t_pump_f", "temp_bb", "rhi", "esf_bb", "distToLovi", "inv_dist",
"datetime", "monthly_oil", "monthly_gas", "distToLovi_wells",
"time_hourly")
# Generate the formula for the GAM model
formula_str_s <- paste(response_var, "~", paste(paste0("s(", setdiff(names(loving_hourly), exclude_cols), ")"), collapse = "+"))
formula_str <- paste(response_var, "~", paste(setdiff(names(loving_hourly), exclude_cols), collapse = "+"))
# Fit the GAM model
hourly_v1 <- mgcv::gam(as.formula(
paste0(formula_str, "+s(as.numeric(datetime))", "+s(as.numeric(substr(time_hourly, 12, 13)))")
), data = loving_hourly)
# View the summary of the GAM model
summary(hourly_v1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## rd_particle_pCi ~ co + no + no2 + nox + o3 + temp_f + pressure_altcorr +
## wsp + wdr + relh + rain + solr + co2_ppm + ch4 + h2o_sync +
## ethane + ethene + propane + propene + X1_3.butadiene + i.butane +
## n.butane + acetylene + cyclopentane + i.pentane + n.pentane +
## n.hexane + isoprene + n.heptane + benzene + n.octane + toluene +
## ethyl.benzene + m.p.xylene + o.xylene + hour + closest.flare +
## weighted.count + count + s(as.numeric(datetime)) + s(as.numeric(substr(time_hourly,
## 12, 13)))
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.004e+00 1.828e-01 -5.493 4.32e-08 ***
## co 4.258e-04 4.226e-05 10.076 < 2e-16 ***
## no -4.282e-01 1.269e-01 -3.373 0.000754 ***
## no2 -4.334e-01 1.270e-01 -3.414 0.000650 ***
## nox 4.290e-01 1.269e-01 3.379 0.000737 ***
## o3 -8.641e-04 1.680e-04 -5.143 2.90e-07 ***
## temp_f -8.716e-04 4.040e-04 -2.157 0.031066 *
## pressure_altcorr 1.757e-03 6.195e-04 2.836 0.004608 **
## wsp -1.089e-03 9.101e-04 -1.196 0.231745
## wdr 1.139e-04 2.222e-05 5.126 3.17e-07 ***
## relh -2.077e-03 2.797e-04 -7.427 1.48e-13 ***
## rain 1.328e-02 6.527e-02 0.203 0.838801
## solr 3.290e-05 1.237e-05 2.660 0.007852 **
## co2_ppm 4.469e-03 2.828e-04 15.804 < 2e-16 ***
## ch4 5.259e-05 4.294e-06 12.246 < 2e-16 ***
## h2o_sync 1.781e-02 8.297e-03 2.146 0.031950 *
## ethane 3.356e-04 7.293e-05 4.602 4.38e-06 ***
## ethene -2.317e-03 3.166e-03 -0.732 0.464320
## propane -8.649e-05 1.066e-04 -0.812 0.417147
## propene -6.952e-02 2.981e-02 -2.332 0.019757 *
## X1_3.butadiene 5.622e-01 2.176e-01 2.583 0.009833 **
## i.butane -5.949e-04 6.778e-04 -0.878 0.380245
## n.butane -9.720e-05 4.674e-04 -0.208 0.835273
## acetylene 4.057e-03 7.783e-03 0.521 0.602217
## cyclopentane -5.389e-02 2.127e-02 -2.534 0.011335 *
## i.pentane -2.713e-03 1.800e-03 -1.508 0.131715
## n.pentane 5.924e-03 2.839e-03 2.087 0.037019 *
## n.hexane -3.552e-03 6.430e-03 -0.552 0.580693
## isoprene 1.120e-01 1.432e-01 0.782 0.434165
## n.heptane 8.550e-03 1.130e-02 0.757 0.449409
## benzene -2.187e-02 8.386e-03 -2.608 0.009164 **
## n.octane -8.600e-02 2.283e-02 -3.768 0.000168 ***
## toluene 5.509e-02 1.037e-02 5.311 1.18e-07 ***
## ethyl.benzene -7.573e-02 1.448e-02 -5.230 1.83e-07 ***
## m.p.xylene 1.269e-01 2.777e-02 4.568 5.15e-06 ***
## o.xylene -3.513e-02 6.694e-02 -0.525 0.599789
## hour -2.232e-01 4.208e-02 -5.304 1.22e-07 ***
## closest.flare 1.588e-04 2.048e-04 0.775 0.438208
## weighted.count 3.397e-03 6.338e-03 0.536 0.591977
## count 9.443e-05 9.724e-05 0.971 0.331600
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(datetime)) 8.864 8.994 43.54 <2e-16 ***
## s(as.numeric(substr(time_hourly, 12, 13))) 7.023 8.098 18.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 57/58
## R-sq.(adj) = 0.731 Deviance explained = 73.6%
## GCV = 0.0045911 Scale est. = 0.0044997 n = 2770
# using leaps and regsubset to search for best predictors
# exhaustive
# not including flares and oil&gas variables
# adding a hour variable first
loving_hourly_noflares <- loving_hourly_noflares %>% mutate(hour = as.numeric(substr(time_hourly, 12, 13)))
hourly_exhaust = regsubsets(
radon_pCi~.-time_hourly -rd_particle_B-rd_particle_pCi-radon_B-monthly_oil-monthly_gas-distToLovi_wells,
data = loving_hourly_noflares, nvmax=30, method = "exhaustive")
reg_summary = summary(hourly_exhaust)
par(mfrow = c(2,2))
plot(reg_summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg_summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")
# We will now plot a red dot to indicate the model with the largest adjusted R^2 statistic.
# The which.max() function can be used to identify the location of the maximum point of a vector
adj_r2_max = which.max(reg_summary$adjr2) #25
# The points() command works like the plot() command, except that it puts points
# on a plot that has already been created instead of creating a new plot
points(adj_r2_max, reg_summary$adjr2[adj_r2_max], col ="red", cex = 2, pch = 20)
# We'll do the same for C_p and BIC, this time looking for the models with the SMALLEST statistic
plot(reg_summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
cp_min = which.min(reg_summary$cp) # 20
points(cp_min, reg_summary$cp[cp_min], col = "red", cex = 2, pch = 20)
plot(reg_summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
bic_min = which.min(reg_summary$bic) # 16
points(bic_min, reg_summary$bic[bic_min], col = "red", cex = 2, pch = 20)

# checking coefficients of the variables selected using adjusted R^2
plot(hourly_exhaust, scale = "adjr2")

coef(hourly_exhaust, 25)
## (Intercept) datetime co no2
## -2.492603e+01 7.509371e-04 1.405440e-04 -5.323818e-03
## nox temp_f pressure_altcorr wsp
## 7.586770e-04 -5.556106e-03 5.237031e-03 -2.641061e-02
## wdr relh co2_ppm h2o_sync
## 4.765364e-04 -1.841294e-03 1.401657e-02 -2.407520e-02
## ethane propene X1_3.butadiene i.butane
## 8.290471e-04 -3.109479e-01 2.904302e+00 5.668343e-03
## n.butane acetylene cyclopentane n.pentane
## -3.851351e-03 -7.103973e-02 1.722275e-01 -4.858070e-03
## n.hexane benzene ethyl.benzene m.p.xylene
## -1.625355e-02 3.802089e-02 -9.486146e-02 2.056836e-01
## o.xylene hour
## -3.620882e-01 -2.159373e-03
# checking coefficients of the variables selected using Cp
plot(hourly_exhaust, scale = "Cp")

coef(hourly_exhaust, 20)
## (Intercept) datetime no2 nox
## -2.311603e+01 6.524077e-04 -5.304310e-03 9.193150e-04
## temp_f pressure_altcorr wsp wdr
## -6.255999e-03 5.316260e-03 -2.642356e-02 4.969661e-04
## relh co2_ppm ethane propene
## -2.616477e-03 1.425182e-02 8.758579e-04 -3.086046e-01
## X1_3.butadiene i.butane n.butane acetylene
## 2.769437e+00 5.300563e-03 -4.218410e-03 -6.892481e-02
## cyclopentane n.hexane n.heptane benzene
## 1.553230e-01 -3.506564e-02 4.037811e-02 4.005663e-02
## hour
## -2.116066e-03
# checking coefficients of the variables selected using adjusted R^2
plot(hourly_exhaust, scale = "bic")

coef(hourly_exhaust, 16)
## (Intercept) datetime no2 temp_f
## -2.282576e+01 6.382371e-04 -3.510055e-03 -6.145565e-03
## pressure_altcorr wsp wdr relh
## 5.269061e-03 -2.598156e-02 5.024433e-04 -2.553147e-03
## co2_ppm ethane propene X1_3.butadiene
## 1.430202e-02 9.307781e-04 -3.153754e-01 2.781602e+00
## i.butane n.butane acetylene benzene
## 3.971486e-03 -3.536060e-03 -7.738726e-02 6.578672e-02
## hour
## -2.165885e-03
# Fitting a Gam on the BIC selected variables
hourly_v2 <- mgcv::gam(radon_pCi~ s(as.numeric(datetime)) + no2 + temp_f + pressure_altcorr + wsp + wdr + relh + co2_ppm + ethane + propene + X1_3.butadiene + n.butane + acetylene + benzene +s(hour), data = loving_hourly_noflares)
# View the summary of the GAM model
summary(hourly_v2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## radon_pCi ~ s(as.numeric(datetime)) + no2 + temp_f + pressure_altcorr +
## wsp + wdr + relh + co2_ppm + ethane + propene + X1_3.butadiene +
## n.butane + acetylene + benzene + s(hour)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.019e+01 1.296e+00 -7.864 5.16e-15 ***
## no2 -1.015e-03 7.927e-04 -1.280 0.200645
## temp_f -7.077e-03 7.757e-04 -9.123 < 2e-16 ***
## pressure_altcorr 6.992e-03 1.214e-03 5.760 9.26e-09 ***
## wsp -2.550e-02 1.860e-03 -13.706 < 2e-16 ***
## wdr 5.575e-04 4.508e-05 12.367 < 2e-16 ***
## relh -3.898e-03 2.796e-04 -13.938 < 2e-16 ***
## co2_ppm 9.964e-03 5.543e-04 17.974 < 2e-16 ***
## ethane 8.467e-04 1.065e-04 7.949 2.65e-15 ***
## propene -2.243e-01 4.852e-02 -4.622 3.96e-06 ***
## X1_3.butadiene 2.167e+00 4.143e-01 5.230 1.81e-07 ***
## n.butane -7.977e-04 2.150e-04 -3.710 0.000211 ***
## acetylene -6.175e-02 1.560e-02 -3.959 7.70e-05 ***
## benzene 2.540e-02 7.996e-03 3.177 0.001505 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(datetime)) 8.862 8.994 20.663 <2e-16 ***
## s(hour) 6.605 7.768 8.651 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.608 Deviance explained = 61.1%
## GCV = 0.022872 Scale est. = 0.022646 n = 2977
plot(hourly_v2)


Exhaustive Search and removing correlated predictors
# correlation matrix
M_hourly = cor(loving_hourly_noflares %>% select(-c( "datetime", "monthly_oil", "monthly_gas", "distToLovi_wells",
"radon_B", "rd_particle_B", "radon_pCi","time_hourly")) %>% na.omit())
corrplot.mixed(M_hourly, order="AOE", number.cex = 0.5, tl.pos = "lt")

remaining_predictors <- setdiff(colnames(M_hourly), "rd_particle_pCi")
for (var_name in setdiff(colnames(M_hourly), "rd_particle_pCi")) {
# skip the variable if it already has been removed
if (!(var_name %in% remaining_predictors)){
next
}
# find variables above threshold and return their correlation with response
top_cor_vars <- correlated_above_threshold(M_hourly, var_name, 0.7, "rd_particle_pCi")
# only keep the variables that are still in remaining_predictors
top_cor_vars$variables <- intersect(top_cor_vars$variable, remaining_predictors)
top_cor_vars$scores <- top_cor_vars$scores[top_cor_vars$variable]
# if var_name is not correlated with any other variable above threshold
if (length(top_cor_vars$variables) == 1){
next
}
best <- names(which.max(abs(top_cor_vars$scores)))
to_remove <- setdiff(top_cor_vars$variables, best)
# print(cat("varname", var_name))
# print(cat("kicking out", to_remove))
# remove the to_remove variables from remaining_predictors
remaining_predictors <- setdiff(remaining_predictors, to_remove)
}
remaining_predictors
## [1] "co" "temp_f" "pressure_altcorr" "wsp"
## [5] "wdr" "relh" "rain" "solr"
## [9] "ch4" "ethene" "X1_3.butadiene" "isoprene"
## [13] "m.p.xylene" "hour" "weighted.count"
M_hourly_2 <- cor(loving_hourly_noflares %>% select("rd_particle_pCi", "co", "temp_f", "pressure_altcorr", "wsp", "wdr", "relh","rain", "solr","ch4", "ethene", "X1_3.butadiene", "isoprene" , "m.p.xylene", "hour", "weighted.count") %>% na.omit())
corrplot.mixed(M_hourly_2, number.cex = 0.5, tl.pos = "lt")

# fit a Gam on that
hourly_no_cor <- mgcv::gam(rd_particle_pCi ~ co + temp_f + pressure_altcorr + wsp + wdr + relh + rain + ch4 + ethene + X1_3.butadiene + isoprene + m.p.xylene + hour + weighted.count, data = loving_hourly_noflares)
summary(hourly_no_cor)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## rd_particle_pCi ~ co + temp_f + pressure_altcorr + wsp + wdr +
## relh + rain + ch4 + ethene + X1_3.butadiene + isoprene +
## m.p.xylene + hour + weighted.count
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.556e+00 5.016e-01 -7.090 1.67e-12 ***
## co 2.588e-04 4.173e-05 6.202 6.34e-10 ***
## temp_f 1.617e-04 1.713e-04 0.944 0.3451
## pressure_altcorr 3.458e-03 4.928e-04 7.017 2.79e-12 ***
## wsp -8.217e-03 9.680e-04 -8.489 < 2e-16 ***
## wdr 3.310e-04 2.302e-05 14.382 < 2e-16 ***
## relh -2.085e-04 1.091e-04 -1.911 0.0561 .
## rain -8.990e-02 8.033e-02 -1.119 0.2632
## ch4 7.715e-05 3.416e-06 22.585 < 2e-16 ***
## ethene 4.212e-03 2.752e-03 1.531 0.1260
## X1_3.butadiene 2.609e-01 1.255e-01 2.080 0.0376 *
## isoprene 1.388e-01 1.301e-01 1.068 0.2858
## m.p.xylene 3.009e-02 5.021e-03 5.994 2.30e-09 ***
## hour -2.975e-03 2.548e-04 -11.675 < 2e-16 ***
## weighted.count -1.211e-03 1.360e-03 -0.890 0.3734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.556 Deviance explained = 55.8%
## GCV = 0.007133 Scale est. = 0.0070973 n = 2991